Price Histograms with Facet and Color

Create a histogram of diamond prices. Facet the histogram by diamond color and use cut to color the histogram bars.

# Histogram of diamond prices in bins of width 100, drawn as one panel per
# diamond color, with the bars filled according to cut.
ggplot(data = diamonds, aes(x = price)) +
  # cut is mapped to `fill` (not `color`): `color` only tints the bar
  # outlines and would leave scale_fill_brewer() below with nothing to style.
  geom_histogram(aes(fill = cut), binwidth = 100) +
  facet_wrap(~ color) +
  scale_fill_brewer(type = "qual")

Price vs. Table Colored by Cut

Create a scatterplot of diamond price vs. table and color the points by the cut of the diamond.

# Scatterplot of price against table with points colored by cut. The view is
# zoomed (without dropping data) to table values 50 through 70, and the
# x-axis carries a break at every whole number in that range.
ggplot(diamonds, aes(x = table, y = price, color = cut)) +
  geom_point() +
  coord_cartesian(xlim = c(50, 70)) +
  scale_x_continuous(breaks = seq(from = 50, to = 70, by = 1))

Price vs. Volume and Diamond Clarity

Create a scatterplot of diamond price vs. volume (x * y * z) and color the points by the clarity of the diamonds. Use a log10 scale on the y-axis for price. You should also omit the top 1% of diamond volumes from the plot.

# Volume is the product of the three dimension columns x, y and z.
diamonds$volume <- diamonds$x * diamonds$y * diamonds$z

# 99th percentile of volume; rows above it are left out of the plot.
vol99q <- quantile(diamonds$volume, probs = 0.99)

# Price (log10 y-axis) against volume, points colored by clarity and drawn
# semi-transparent.
ggplot(subset(diamonds, volume <= vol99q), aes(x = volume, y = price)) +
  geom_point(aes(color = clarity), alpha = 0.2) +
  scale_y_log10()

Proportion of Friendships Initiated

Your task is to create a new variable called ‘prop_initiated’ in the Pseudo-Facebook data set. The variable should contain the proportion of friendships that the user initiated.

# Load the tab-separated Pseudo-Facebook data set.
pf <- read.delim("pseudo_facebook.tsv")

# Share of each user's friendships that the user initiated. Users with no
# friends get 0 rather than a division by zero.
pf$prop_initiated <- ifelse(
  pf$friend_count > 0,
  pf$friendships_initiated / pf$friend_count,
  0
)

prop_initiated vs. tenure

Create a line graph of the median proportion of friendships initiated (‘prop_initiated’) vs. tenure and color the line segment by year_joined.bucket.

# Estimate the year each user joined by counting back from 2014 by
# tenure / 365 (presumably tenure is measured in days -- confirm against the
# data set), then bin the years into four join cohorts.
pf$year_joined <- with(pf, floor(2014 - tenure / 365))
pf$year_joined.bucket <- with(
  pf,
  cut(year_joined, breaks = c(2004, 2009, 2011, 2012, 2014))
)

# Median prop_initiated at each tenure value, one line per join cohort.
# Only rows with prop_initiated > 0 are plotted.
ggplot(data = subset(pf, prop_initiated > 0),
       aes(x = tenure, y = prop_initiated)) +
  # `fun` replaces the `fun.y` argument, deprecated since ggplot2 3.3.0.
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = median)
## Warning: Removed 2 rows containing non-finite values (stat_summary).

Smoothing prop_initiated vs. tenure

Smooth the last plot you created of prop_initiated vs tenure colored by year_joined.bucket.

# Median prop_initiated vs. tenure per join cohort, overlaid with a smoother
# for each cohort. Only rows with prop_initiated > 0 are plotted.
# The color mapping lives in ggplot() so that BOTH layers inherit it: with the
# mapping on geom_line() alone, geom_smooth() fits a single pooled curve
# instead of one smoother per year_joined.bucket.
ggplot(data = subset(pf, prop_initiated > 0),
       aes(x = tenure, y = prop_initiated, color = year_joined.bucket)) +
  # `fun` replaces the `fun.y` argument, deprecated since ggplot2 3.3.0.
  geom_line(stat = "summary", fun = median) +
  geom_smooth()
## Warning: Removed 2 rows containing non-finite values (stat_summary).
## Warning: Removed 2 rows containing non-finite values (stat_smooth).

# Keep only the users whose join year falls in the (2012,2014] bucket, then
# report the mean proportion of friendships they initiated.
prop_initiated.2012.2014 <- filter(pf, year_joined.bucket == "(2012,2014]")
mean(prop_initiated.2012.2014[["prop_initiated"]])
## [1] 0.6430155

Price/Carat Binned, Faceted, & Colored

Create a scatter plot of the price/carat ratio of diamonds. The variable x should be assigned to cut. The points should be colored by diamond color, and the plot should be faceted by clarity.

# Price per carat for each cut, jittered to reduce overplotting, colored by
# diamond color and split into one panel per clarity level. Rows with
# carat <= 0 are excluded before the ratio is computed.
ggplot(subset(diamonds, carat > 0),
       aes(x = cut, y = price / carat, color = color)) +
  geom_point(position = "jitter") +
  scale_color_brewer(type = "div") +
  facet_wrap(~ clarity)

Gapminder Multivariate Analysis

Load and tidy the data

library(tidyr)
# Read the three Gapminder indicator tables, keeping text columns as
# character vectors rather than factors.
res.e.usage.pp <- read.csv(
  file = "Indicator_Residential electricity consumption per person.csv",
  row.names = NULL,
  stringsAsFactors = FALSE
)
e.gen.pp <- read.csv(
  file = "Electricity Generation per capita.csv",
  row.names = NULL,
  stringsAsFactors = FALSE
)
e.nuclear.pp <- read.csv(
  file = "Indicator_Nuclear production per capita.csv",
  row.names = NULL,
  stringsAsFactors = FALSE
)

# Countries that appear in all three tables.
countryList <- Reduce(
  intersect,
  list(res.e.usage.pp$Country, e.gen.pp$Country, e.nuclear.pp$Country)
)

# Reshape each indicator table from wide (one column per year) to long (one
# row per Country/Year), restricted to the countries shared by all three
# tables. The gathered Year values are the former column names; only
# characters 2-5 are kept (presumably this strips the "X" that read.csv
# prefixes to numeric headers -- confirm against the files).

# Residential consumption: year columns are at positions 2 through 50.
res.e.usage.pp.tidy <- res.e.usage.pp %>%
  filter(Country %in% countryList) %>%
  gather(key = "Year", value = "kwH.residential", 2:50) %>%
  mutate(Year = substr(Year, 2, 5))

# Generation: year columns are at positions 2 through 20.
e.gen.pp.tidy <- e.gen.pp %>%
  filter(Country %in% countryList) %>%
  gather(key = "Year", value = "kwH.generated", 2:20) %>%
  mutate(Year = substr(Year, 2, 5))

# Nuclear production: year columns are at positions 2 through 53.
e.nuclear.pp.tidy <- e.nuclear.pp %>%
  filter(Country %in% countryList) %>%
  gather(key = "Year", value = "kwH.nuclear", 2:53) %>%
  mutate(Year = substr(Year, 2, 5))

# Combine the three long tables on their shared columns (left joins starting
# from residential consumption), then keep only the rows where all three
# measurements are present.
e.all.tidy <- res.e.usage.pp.tidy %>%
  left_join(e.gen.pp.tidy) %>%
  left_join(e.nuclear.pp.tidy) %>%
  filter(
    !is.na(kwH.residential),
    !is.na(kwH.generated),
    !is.na(kwH.nuclear)
  )
## Joining by: c("Country", "Year")
## Joining by: c("Country", "Year")

Make Plots

# Residential consumption per year, one panel per country, with points
# colored by the amount of electricity generated.
ggplot(e.all.tidy,
       aes(x = Year, y = kwH.residential, color = kwH.generated)) +
  geom_point() +
  facet_wrap(~ Country)

# Residential consumption per year, one panel per country, with points
# colored by nuclear production.
ggplot(e.all.tidy,
       aes(x = Year, y = kwH.residential, color = kwH.nuclear)) +
  geom_point() +
  facet_wrap(~ Country)